// Reset the session (data in memory and other stored objects) before the build starts
clear all
// Turn off output paging; -capture- swallows any error raised by this command
capture set more off

*******************************************************************************************************************************************
// Admin
*******************************************************************************************************************************************
/*
	// Set directory path
	foreach i in "C:/Users/Andy/Dropbox/projects/WWI German Discrimination"		///
				 "C:/Users/anf137/Dropbox/projects/WWI German Discrimination"	///
				 "Z:/Dropbox/projects/WWI German Discrimination"				{
		
				global path "`i'"
				confirmdir "$path"
				if `r(confirmdir)'==0 continue, break
	}
*/

*******************************************************************************************************************************************
// Merge linked Census data with casualty, geographic, and aggregate Census controls and generate relevant variables
*******************************************************************************************************************************************

	* load the linked 1910-20 Census data for German individuals
	* (the global macro "path" must already be defined: the path-setting loop in the Admin section is commented out)
	if `"$path"' == "" {
		display as error "global macro path is not set; define it before running this do-file"
		exit 198
	}
	use "$path/Replication/cleaned_data/germans1910-20_linked.dta"

	* make sure each person is unique
	* -duplicates tag- stores the number of surplus copies of each observation (0 = unique,
	* 1 = one duplicate, 2 = two duplicates, ...), so every positive value must be dropped;
	* testing for ==1 would leave triplicates and larger groups in the data
	duplicates tag histid_1910 year, gen(tag10)
	duplicates tag histid_1920 year, gen(tag20)
	drop if tag10>0 | tag20>0
	drop tag10 tag20
	
	* county code: a copy of countyicp
	gen county = countyicp

	* combined state-county identifier, computed as (10000*statefip + county)/10 and stored as fips
	* NOTE(review): the division by 10 presumably turns the ICPSR county code into a 5-digit FIPS code -- confirm
	gen fips = (10000*statefip + county)/10

	* run the helper do-file that (per its file name) standardizes county boundaries to 1910
	quietly do "$path/Replication/code/standardize_county_boundaries_to_1910.do"
	
	* WWI casualty information by county: only observations matched in both files are kept
	merge m:1 fips using "$path/Replication/cleaned_data/WWI casualties Army Navy clean - county.dta", keep(match) nogenerate

	* WWI draft information by county: unmatched rows from the using file are discarded
	gen statefips = statefip
	merge m:1 statefips county using "$path/Replication/cleaned_data/WWI inductions by county - clean.dta", keep(master match) nogenerate

	* granular age distribution info in 1910: unmatched rows from the using file are discarded
	merge m:1 fips using "$path/Replication/raw_data/age_distribution.dta", keep(master match) nogenerate

	* latitude and longitude data, keyed on icpsrfip (a copy of fips) and county
	gen icpsrfip = fips
	merge m:1 icpsrfip county using "$path/Replication/raw_data/county_lat_lon.dta", keep(master match) nogenerate

	* German names, keyed on both person identifiers and the census year
	merge m:1 histid_1910 histid_1920 year using "$path/Replication/raw_data/linked_german_names_anonymized.dta", keep(master match) nogenerate

	
	* sum men aged 18-45 during the war years (row total of the columns am_10 through am_37)
	egen age18_45_amm = rowtotal(am_10- am_37)
	
	* casualty rate (per 100 men) computed in the 1910 row, spread to all rows of the person,
	* then set to zero in the 1910 row so that it acts as a "rate x post" variable
	gen casrate2 = dead_all/age18_45_amm*100 if year==1910
	egen casualtyrate2 = max(casrate2), by(id)
	replace casualtyrate2 = 0 if year==1910
	
	* top quintile casualty rate times post
	* NOTE(review): xtile is computed over all rows, including the 1910 rows that were just set to 0,
	* so k2==5 may not correspond to the top quintile among 1920 rows -- confirm this is intended
	xtile k2 = casualtyrate2, nq(5)
	gen treat = (k2==5)*(year==1920)
	
	* draft rate times post, capped at 100
	* the missing-value guard now names drate2 explicitly: the former "drate!=." only worked through
	* variable abbreviation and would silently pick up any variable actually called drate
	gen drate2 = tot_accepted_num/age18_45_amm*100 if year==1910
	replace drate2 = 100 if drate2>100 & !missing(drate2)
	egen draftrate2 = max(drate2), by(id)
	replace draftrate2 = 0 if year==1910
	drop k2 drate2 all_* am_*
	
	ren (casualtyrate2 draftrate2) (casualtyrate draftrate)
	
	// dummy for whether individual moved county between 1910 and 1920
	// moved = within-person standard deviation of fips; it is 0 when fips is the same in all of the person's rows
	egen moved = sd(fips), by(id)

	// sd() is missing when a person has fewer than two non-missing fips values. Such a person cannot be
	// classified, so mover is left missing instead of being coded 1 (a missing value satisfies "moved!=0")
	cap drop mover
	gen mover = (moved > 0) if !missing(moved)

	// the indicator is only switched on in the 1920 row
	replace mover = 0 if year==1910


	// skill groups from occ1950: start everyone at 0 (which is also where missing occ1950 stays),
	// then assign each group from its code range
	gen skill = 0
	replace skill = 1 if occ1950 < 100

	// groups 2-9 are consecutive 100-wide ranges: [100,200), [200,300), ..., [800,900)
	forvalues g = 2/9 {
		local lo = 100*(`g' - 1)
		local hi = 100*`g'
		replace skill = `g' if occ1950 >= `lo' & occ1950 < `hi'
	}

	// the last two groups split at 979
	replace skill = 10 if occ1950 >= 900 & occ1950 < 979
	replace skill = 11 if occ1950 >= 979 & occ1950 != .

	// value labels for the skill groups
	#delimit ;
	label define skillab
		1  "Professional, Technical"
		2  "Farmers"
		3  "Managers, Officials, Proprietors"
		4  "Clerical and Kindred"
		5  "Sales workers"
		6  "Craftsmen"
		7  "Operatives"
		8  "Service Workers (priv househ)"
		9  "Farm Laborers"
		10 "Laborers"
		11 "Non-occupational response" ;
	#delimit cr

	label values skill skillab
	
	// industry groups from ind1950: group g covers [cut_g, cut_(g+1)) for the cut points listed below;
	// codes below 100 and missing codes stay at 0, and everything from 979 upwards is group 18
	local cuts 100 127 240 300 400 500 569 580 600 628 700 757 818 850 860 900 947 979

	gen industry = 0
	forvalues g = 1/17 {
		local lo : word `g' of `cuts'
		local hi : word `=`g'+1' of `cuts'
		replace industry = `g' if ind1950 >= `lo' & ind1950 < `hi'
	}
	replace industry = 18 if ind1950 >= 979 & ind1950 != .

	// value labels for the industry groups
	#delimit ;
	label define indlab
		0  "N/A or none reported"
		1  "Agriculture, Forestry, Fishing"
		2  "Mining"
		3  "Construction"
		4  "Manufacturing (dur.)"
		5  "Manufacturing (nondur.)"
		6  "Transportation"
		7  "Telecommunications"
		8  "Utilities, Sanitary Services"
		9  "Wholesale Trade"
		10 "Retail Trade"
		11 "Finance, Insurance, Real Estate"
		12 "Business and Repair Services"
		13 "Personal services"
		14 "Entertainment/Recreation Services"
		15 "Professional and Related Services"
		16 "Public Administration"
		17 "Not yet specified"
		18 "Others" ;
	#delimit cr

	label values industry indlab
	
	// 1910 baseline characteristics: for every listed variable, take the person's 1910 value,
	// spread it over all rows of that person, and zero it out in the 1910 row itself
	local basevars urban skill industry farm classwkr lit marst yrsusa2 statefip empstat famsize wksunemp school labforce fips
	foreach v of local basevars {
		quietly {
			gen `v'_temp = `v' if year==1910
			egen `v'1910 = max(`v'_temp), by(id)
			replace `v'1910 = 0 if year==1910
			label values `v'1910 `v'_lbl
			drop `v'_temp
		}
	}

	// baseline (omitted) categories for the 1910 variables interacted with the 1920 dummy:
	// each "variable code" rule recodes that code of <variable>1910 to 0
	//   urban 1 = rural                  skill 11 = non-occ response     farm 1 = non-farm
	//   empstat 30 = not in labor force  lit 1 = illiterate              marst 6 = single
	//   yrsusa2 5 = 21+ yrs              statefip 48 = Texas             fips 42003 = Allegheny county, Pennsylvania
	//   school 1 = not in school         labforce 1 = not in labor force
	foreach rule in "urban 1" "skill 11" "farm 1" "empstat 30" "lit 1" "marst 6" "yrsusa2 5" "statefip 48" "fips 42003" "school 1" "labforce 1" {
		gettoken v code : rule
		replace `v'1910 = 0 if `v'1910 == `code'
	}

	// the grouped variables use the labels defined in this file rather than <var>_lbl
	label values industry1910 indlab
	label values skill1910 skillab
	
	* does not speak English in 1910 (speakeng==1 in the 1910 row), plus the person-level maximum
	gen nospeakeng10 = (year==1910) & (speakeng==1)
	egen maxnospeakeng10 = max(nospeakeng10), by(id)

	* manufacturing worker at baseline (industry groups 4 and 5), plus the person-level maximum
	gen mfg10 = inlist(industry1910, 4, 5)
	egen maxmfg10 = max(mfg10), by(id)

	* person-level indicator for movers
	egen maxmover = max(mover), by(id)

	* distance moved in miles: copy the 1910 coordinates onto every row of the person,
	* then measure from the 1920 row's coordinates to them
	gen point_x10 = cond(year==1910, point_x, .)
	gen point_y10 = cond(year==1910, point_y, .)
	egen point_x1910 = max(point_x10), by(id)
	egen point_y1910 = max(point_y10), by(id)

	geodist point_y point_x point_y1910 point_x1910 if year==1920, gen(dist) mi
	replace dist = 0 if year==1910
	
	* dummy for no naturalization (citizen codes 3 and 4)
	gen nonat = inlist(citizen, 3, 4)
	
	* dummy for no English language skills
	gen noeng = (speakeng==1)
	
	* dummy for changing from non-agricultural (1910) to agricultural job (1920)
	egen agri10 = max(industry1910==1), by(id)
	gen farmer = (year==1920) & (industry==1) & (agri10==0)
	
	* dummy for leaving the midwest from 1910 to 1920
	* (the state list is split over two inlist() calls to stay within its limit on string arguments)
	gen midwest = inlist(statename, "Ohio", "Indiana", "Michigan", "Illinois", "Wisconsin", "Missouri") | ///
				  inlist(statename, "Iowa", "Minnesota", "North Dakota", "South Dakota", "Nebraska", "Kansas")
	gen notmidwest = !midwest
	gen notmidwest20 = notmidwest & (year==1920)

	* make sure the sample is consistent, i.e. each individual is unique, available in both 1910 and 1920, and no missings in key variables
	global controls "c.draftrate i.urban1910 i.skill1910 i.farm1910 i.empstat1910 i.lit1910 i.marst1910 i.yrsusa21910 c.famsize1910 i.school1910 i.labforce1910 c.wksunemp1910"

	* flag missing distances and set them to zero so those rows are not dropped for a missing outcome
	gen missdist = (dist==.)
	replace dist = 0 if dist==.

	* distance regression among movers: K marks its estimation sample, sumK counts a person's rows in it
	reghdfe dist treat $controls missdist i.year if maxmover==1 & abe_nysiis_standard==1, absorb(fips1910 birthyr bpl) vce(cluster fips)
	gen K = e(sample)
	egen sumK = total(K), by(id)

	* mover regression: J marks its estimation sample, sumJ counts a person's rows in it
	reghdfe mover treat $controls i.year if abe_nysiis_standard==1, absorb(fips1910 birthyr bpl) vce(cluster fips)
	gen J = e(sample)
	egen sumJ = total(J), by(id)

	* keep persons with both rows in the mover sample and either both or none of their rows in the distance sample
	gen D = (sumJ==2) & inlist(sumK, 0, 2)
	keep if D==1
	
	* shrink storage types and write the estimation sample to disk
	compress
	save "$path/Replication/cleaned_data/linked_Germans_estimation_sample.dta", replace
	clear
	